{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# 处理缺失数据"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-28T16:26:23.722427Z",
     "start_time": "2024-04-28T16:26:23.685273100Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          0         1         2\n",
      "0 -0.991201 -0.356181 -1.507123\n",
      "1  1.000000  2.000000       NaN\n",
      "2       NaN  4.000000       NaN\n",
      "3  1.000000  2.000000  3.000000\n"
     ]
    }
   ],
   "execution_count": 91
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          0         1         2\n",
      "0 -1.085631  0.997345  0.282978\n",
      "1  1.000000  2.000000       NaN\n",
      "2       NaN  4.000000       NaN\n",
      "3  1.000000  2.000000  3.000000\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "np.random.seed(123)\n",
    "df_data = pd.DataFrame([np.random.randn(3), [1., 2., np.nan],\n",
    "                       [np.nan, 4., np.nan], [1., 2., 3.]])\n",
    "print(df_data.head())"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2024-05-05T06:36:01.780050300Z",
     "start_time": "2024-05-05T06:36:01.768084Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [
    {
     "data": {
      "text/plain": "nan"
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_data.iloc[2,0]"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-05-05T06:36:01.827954800Z",
     "start_time": "2024-05-05T06:36:01.783043Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       0      1      2\n",
      "0  False  False  False\n",
      "1  False  False   True\n",
      "2   True  False   True\n",
      "3  False  False  False\n"
     ]
    }
   ],
   "source": [
    "#isnull来判断是否有空的数据\n",
    "print(df_data.isnull())"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2024-05-05T06:36:01.842881700Z",
     "start_time": "2024-05-05T06:36:01.799998100Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### 删除缺失数据"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-28T16:17:10.218427Z",
     "start_time": "2024-04-28T16:17:10.196180Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          0         1         2\n",
      "0  0.560802  0.246639 -0.011426\n",
      "1  1.000000  2.000000       NaN\n",
      "3  1.000000  2.000000  3.000000\n"
     ]
    }
   ],
   "execution_count": 69
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          0         1         2\n",
      "0 -1.085631  0.997345  0.282978\n",
      "1  1.000000  2.000000       NaN\n",
      "3  1.000000  2.000000  3.000000\n"
     ]
    }
   ],
   "source": [
    "#默认一个样本，任何一个特征缺失，就删除\n",
    "#inplace True是修改的是原有的df\n",
    "#subset=[0]是指按第一列来删除,第一列有空值就删除对应的行\n",
    "print(df_data.dropna(subset=[0]))\n",
    "# df_data"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2024-05-05T06:36:01.843879Z",
     "start_time": "2024-05-05T06:36:01.817950200Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "data": {
      "text/plain": "          0         1         2\n0 -1.085631  0.997345  0.282978\n1  1.000000  2.000000       NaN\n2       NaN  4.000000       NaN\n3  1.000000  2.000000  3.000000",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>-1.085631</td>\n      <td>0.997345</td>\n      <td>0.282978</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1.000000</td>\n      <td>2.000000</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>NaN</td>\n      <td>4.000000</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>1.000000</td>\n      <td>2.000000</td>\n      <td>3.000000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_data"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-05-05T06:36:01.843879Z",
     "start_time": "2024-05-05T06:36:01.832909400Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          1\n",
      "0  0.997345\n",
      "1  2.000000\n",
      "2  4.000000\n",
      "3  2.000000\n"
     ]
    }
   ],
   "source": [
    "#用的不多，某个特征缺失太多时\n",
    "print(df_data.dropna(axis=1))  #某列有nan就删除该列"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2024-05-05T06:36:01.917681400Z",
     "start_time": "2024-05-05T06:36:01.850860500Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          0         1         2\n",
      "0 -1.085631  0.997345  0.282978\n",
      "3  1.000000  2.000000  3.000000\n"
     ]
    }
   ],
   "source": [
    "# 只要样本中有nan就删除该样本\n",
    "print(df_data.dropna())"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-05-05T06:36:06.181516200Z",
     "start_time": "2024-05-05T06:36:06.166787800Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "data": {
      "text/plain": "          0         1         2\n0 -1.085631  0.997345  0.282978\n1  1.000000  2.000000       NaN\n2       NaN  4.000000       NaN\n3  1.000000  2.000000  3.000000",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>-1.085631</td>\n      <td>0.997345</td>\n      <td>0.282978</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1.000000</td>\n      <td>2.000000</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>NaN</td>\n      <td>4.000000</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>1.000000</td>\n      <td>2.000000</td>\n      <td>3.000000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_data"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-05-05T06:36:41.914618500Z",
     "start_time": "2024-05-05T06:36:41.882843800Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### 填充缺失数据"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-04-28T16:31:52.151750800Z",
     "start_time": "2024-04-28T16:31:52.116781200Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0   -0.230185\n",
      "1         1.0\n",
      "2           a\n",
      "3         1.0\n",
      "Name: 0, dtype: object\n",
      "--------------------------------------------------\n",
      "            0         1           2\n",
      "0   -0.230185  1.034534   -0.034387\n",
      "1    1.000000  2.000000 -100.000000\n",
      "2 -100.000000  4.000000 -100.000000\n",
      "3    1.000000  2.000000    3.000000\n",
      "--------------------------------------------------\n",
      "          0         1         2\n",
      "0 -0.230185  1.034534 -0.034387\n",
      "1  1.000000  2.000000       NaN\n",
      "2       NaN  4.000000       NaN\n",
      "3  1.000000  2.000000  3.000000\n"
     ]
    }
   ],
   "execution_count": 117
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0     -1.085631\n",
      "1      1.000000\n",
      "2   -100.000000\n",
      "3      1.000000\n",
      "Name: 0, dtype: float64\n"
     ]
    },
    {
     "data": {
      "text/plain": "          0         1         2\n0 -1.085631  0.997345  0.282978\n1  1.000000  2.000000       NaN\n2       NaN  4.000000       NaN\n3  1.000000  2.000000  3.000000",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>-1.085631</td>\n      <td>0.997345</td>\n      <td>0.282978</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1.000000</td>\n      <td>2.000000</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>NaN</td>\n      <td>4.000000</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>1.000000</td>\n      <td>2.000000</td>\n      <td>3.000000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#给零列的空值填为-100，按特征（按列）去填充\n",
    "print(df_data.iloc[:,0].fillna(-100.))\n",
    "df_data"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2024-05-03T16:03:21.224723700Z",
     "start_time": "2024-05-03T16:03:21.164301100Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RangeIndex(start=0, stop=3, step=1)\n",
      "--------------------------------------------------\n",
      "0   -1.085631\n",
      "1    1.000000\n",
      "2         NaN\n",
      "3    1.000000\n",
      "Name: 0, dtype: float64\n",
      "0    0.997345\n",
      "1    2.000000\n",
      "2    4.000000\n",
      "3    2.000000\n",
      "Name: 1, dtype: float64\n",
      "0    0.282978\n",
      "1         NaN\n",
      "2         NaN\n",
      "3    3.000000\n",
      "Name: 2, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "print(df_data.columns)\n",
    "print(\"-\"*50)\n",
    "\n",
    "# 依次拿到每一列\n",
    "for i in df_data.columns:\n",
    "    print(df_data.loc[:,i])"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2024-05-03T16:04:50.329909200Z",
     "start_time": "2024-05-03T16:04:50.288337800Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
